# a real world data set
#
# I went out and did some "research" to find the EPA highway
# mpg and the curb weight (in pounds) of a number of SUVs.
#
# I placed that information into a comma separated file,
# curb_mpg.csv. Within the context of this course, we
# could use the c() function to just get that information into R.
# That is done in the following lines.
highway_mpg <- c(31, 31, 31, 26, 27, 28, 25,
                 20, 21, 33, 19, 27, 31, 17)
curb_weight <- c(3124, 3325, 2996, 4253, 4123, 3665, 4362,
                 5473, 5808, 3490, 4750, 4370, 3300, 5730)
SUVs <- c("Trax", "Equinox sml", "Trailblazer", "Blazer sml",
          "Blazer lrg", "Equinox lrg", "Traverse", "Tahoe",
          "Suburban", "Rav4", "4Runner", "Highlander",
          "C-HR", "Sequoia")
#
# Then, we might look at the relation between curb weight and mpg.
# First, we will create a plot of the two values.
plot( curb_weight, highway_mpg,
      main="Sample of some SUV curb weight vs highway mpg",
      xlim=c(2500,6000), ylim=c(18,34),
      xaxp=c(2500,6000,7), yaxp=c(18,34,8), las=2)
abline( h=seq(18,34,2), v=seq(2500,6000,500),
        col="darkgrey", lty="dotted")
# One thing to note is that the mpg is given in whole numbers.
# That forces a lot of values onto the same horizontal level.
# The graph suggests a linear relationship in which, as the curb
# weight goes up, the mpg goes down. We can look at the strength
# of that relationship by finding the correlation coefficient.
cor( curb_weight, highway_mpg )
# That answer, -0.9300396, is pretty strong. If we look
# at the square of that value, it will tell us the percent of the
# variation shown in the data that is explained by the linear
# model (even though we, at this point, have not found that model).
(-0.9300396)^2
# From this we see that over 86% of the variability is explained
# by the best linear model. So, let us find the coefficients,
# a and b, for the equation of the line, y = a + b*x, that is the
# best fit for our data. Note that means we will have
#     mpg = a + b * curb_weight
#
# We will call our model "ratio".
ratio <- lm( highway_mpg ~ curb_weight )
ratio
# This tells us that the line of best fit looks like
#     mpg = 47.28854 + (-0.00502) * curb_weight
# We can graph that line on our plot, and do it in red, with
# a line width of 2.
abline( ratio, col="red", lwd=2 )
# From this we can predict the mpg for a vehicle that
# has a curb weight of 5000 pounds via
47.28854 + (-0.00502) * 5000
# Knowing that the predicted value is 22.18854, we could even
# plot that point, (5000, 22.18854), on our graph via
points( 5000, 22.18854, col="blue", pch=22 )
# Because 5000 is within the range of observed values for
# our independent variable, the x-variable, curb_weight, this
# is an interpolation. We can feel pretty confident of this
# value.
# In the same way, we could predict the mpg for a
# vehicle that weighs 7500 pounds via
47.28854 + (-0.00502) * 7500
# That value, 9.63854, is interesting, but we should not
# have a lot of confidence in it because the independent
# value, 7500, is not within the observed range of our data.
# That makes this an extrapolation, and we should be hesitant
# to suggest that such a value is accurate.
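#
# A quick aside that goes a bit beyond what we need in this
# class: R has a built-in predict() function that will do both
# of those computations for us. It uses the full-precision
# coefficients stored in the model, so its answers may differ
# slightly from our hand computations with the rounded values.
predict( ratio, newdata=data.frame( curb_weight=c(5000, 7500) ) )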
# One other thing that we should check is the distribution
# of the residual values. The long way to find all of those
# residual values is to first find all of the expected values.
ev_s <- 47.28854 + (-0.00502) * curb_weight
ev_s
# We could even add all of those values, which will be on our
# line of best fit, via
points( curb_weight, ev_s, col="green", pch=15 )
# But what we really want is the residual values, that is,
# the observed values minus the expected values.
resid_vals <- highway_mpg - ev_s
resid_vals
# Then we want to see the scatter plot of the
# curb weight and the residuals.
# That will be a new graph.
plot( curb_weight, resid_vals )
# We will be satisfied if the points are all over the
# place, not forming any particular pattern.
# Note that because we saved the linear model in "ratio",
# we could have found the same residual values via
residuals( ratio )
# The results will be slightly different
# because our earlier calculation used the
# rounded values for a and b, the
# intercept and the slope.
# Or, we could have produced our graph via
plot( curb_weight, residuals( ratio ), main="New residual plot" )
# A bit more than is required in this class, we could
# have put the data into a data frame so that we could get a
# slightly better listing of the data.
our_data <- data.frame( SUVs, curb_weight, highway_mpg )
our_data
# or as
View( our_data )   # be sure to note the capital V
# Also beyond the scope of this course, we could have just
# read the data from the file right into a data frame.
new_data <- read.csv("curb_mpg.csv")
new_data
# Then we can reference the separate columns in the data frame
# by using the data frame name followed by a $ followed by the
# column name.
new_data$curb_weight
#####################################
# We have seen an example of real world data. And, for that
# data, we had to enter the values with the c() statements above,
# or read them from the file with read.csv().
# At times I have students ask why I use gnrnd4 to just
# generate random data rather than using real world data.
# The answer is threefold. First, via gnrnd4 I can get
# newly generated data for each example (that includes
# test questions). Second, just running gnrnd4 puts the
# data into variables, L1 and L2, in our RStudio session.
# Third, via gnrnd4, or gnrnd5, I can mimic just about
# any real world data. Look at the following:
source("../gnrnd5.R")
gnrnd5( 23405001406, 4399456001200, 2900002950 )
plot( L1, L2,
      xlim=c(2500,6000), ylim=c(18,34),
      xaxp=c(2500,6000,7), yaxp=c(18,34,8), las=2)
abline( h=seq(18,34,2), v=seq(2500,6000,500),
        col="darkgrey", lty="dotted")
cor( L1, L2 )
new_ratio <- lm( L2 ~ L1 )
new_ratio
abline( new_ratio, col="red" )
# That is hardly any different from the real world data above.
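#
# Another aside, again more than this class requires: calling
# summary() on a saved model reports the intercept and slope
# along with the R-squared value, which makes it easy to compare
# the generated data to the real data in one step.
summary( new_ratio )
summary( ratio )
# And if we want just the R-squared value for our real data, the
# following should match the squared correlation we found earlier.
summary( ratio )$r.squared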
##################################
# Having said that, let us take a look at two more cases.
# First generate the data.
source("../gnrnd4.R")
gnrnd4( 1770222306, 4840717035, 35700076 )
# Look at our values
L1
L2
# or, using the little extra that we saw above,
data.frame( L1, L2 )
# Then, get a first, ugly plot of the data.
plot( L1, L2 )
# Now, to be fancy, improve the plot.
plot( L1, L2, main="our new example",
      xlim=c(-10,50), ylim=c(-10,100), las=1,
      xaxp=c(-10,50,12), yaxp=c(-10,100,22),
      cex.axis=0.7 )
# Add some light horizontal and vertical lines.
abline( h=seq(-10,100,5), v=seq(-10,50,5),
        col="darkgray", lty="dotted")
# Add the axes.
abline( h=0, v=0, col="black" )
# A linear model looks appropriate, so find the correlation
# coefficient.
cor( L1, L2 )
# Just for our reference, square that.
cor( L1, L2 )^2   # this will make poor R redo the computation
# Now we will get the linear model, i.e., the coefficients.
# We will store the model in lm_2.
lm_2 <- lm( L2 ~ L1 )
# Look at the coefficients.
lm_2
# So our model is L2 = -5.840 + 2.138*L1.
# We will graph that line, in purple.
abline( lm_2, col="purple", lwd=2 )
#
# Use the regression equation to predict the value of L2 if
# L1 is 28.3.
-5.840 + 2.138*28.3
# Graph that point.
points( 28.3, 54.6654, col="darkorange", pch=13 )
# Note that this is an interpolation because 28.3 is
# within the observed range of L1.
# Find the residual value when L1 is 38.1.
# We note that 38.1 is the 6th value in L1; the corresponding
# L2 value, the one in position 6, is 82.0. That is the observed
# value. To get the expected value when L1 is 38.1 we compute
-5.840 + 2.138*38.1
# So the residual value will be the observed minus the expected.
82.0 - 75.6178
# Of course, knowing that the ordered pair that we want
# to use is the 6th ordered pair, we could have just done
residuals( lm_2 )
# and found the 6th answer.
# Again, the difference is due to the rounding error for
# our coefficients. We could have found better values
# for the intercept and slope via
coeffs <- coefficients( lm_2 )
coeffs
# These are still rounded, but now to 6 decimal places.
# Therefore the expected value when L1 is 38.1 becomes
-5.839503 + 2.137889 * 38.1
# And the residual is now
82.0 - 75.61407
# Or we could go a bit more crazy and use the full
# 16-digit values of the coefficients to get the
# expected value.
ev_2 <- coeffs[1] + coeffs[2]*38.1
ev_2
# And now the full residual.
82.0 - ev_2
# Finally, check the distribution of the residuals.
plot( L1, residuals( lm_2 ) )
#
# And now a new example.
gnrnd4( 6513562106, 7166322108, 88700176 )
# Look at our values
L1
L2
# or, using the little extra that we saw above,
data.frame( L1, L2 )
# Then, get a first, ugly plot of the data.
plot( L1, L2 )
summary( L1 )
summary( L2 )
# Now, to be fancy, improve the plot.
plot( L1, L2, main="our new example",
      xlim=c(-10,70), ylim=c(-160,160), las=1,
      xaxp=c(-10,70,16), yaxp=c(-160,160,16),
      cex.axis=0.7 )
# Add some light horizontal and vertical lines.
abline( h=seq(-160,160,20), v=seq(-10,70,5),
        col="darkgray", lty="dotted")
# Add the axes.
abline( h=0, v=0, col="black" )
# Find the correlation coefficient.
cc <- cor( L1, L2 )
cc
# Look at the square of the correlation coefficient.
cc^2   # this time R did not have to recompute
# Not such a good value.
# Find the best line.
best_line <- lm( L2 ~ L1 )
best_line
# Plot the best line.
abline( best_line )
# Use the full equation to predict L2 when L1 is 20.3.
coeffs <- coefficients( best_line )
coeffs
coeffs[1] + coeffs[2]*20.3
# Just to be sure, plot that point.
points( 20.3, 43.17139, col="red", pch=17 )
# Look at the distribution of the residuals.
plot( L1, residuals( best_line ) )
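#
# One last, optional touch: adding a horizontal line at zero to
# a residual plot makes it easier to judge whether the residuals
# scatter evenly above and below the line of best fit.
abline( h=0, col="red", lty="dashed" )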